/* Session setup.
   - Assigns the librefs Input and Output to the directories "Input Data" and
     "Output Data" (relative paths).
   - Then issues an operating-system `cd` so that later unqualified file names
     (e.g. the CSV read by the Contracts step) are looked up in "Input Data".
   NOTE(review): both librefs are assigned BEFORE the cd, so they are presumably
   meant to resolve against the directory SAS was started from - confirm.
   NOTE(review): the directory name contains a space and is not quoted inside
   the shell command; whether `cd Input Data` works depends on the host
   shell - verify on the target platform. */
libname Input "Input Data";
libname Output "Output Data";
data _null_;
	/* X is a global statement; it is not conditional on DATA step execution. */
	x 'cd Input Data';
run;

/* Import the government-contracts CSV into WORK.Contracts.
   - firstobs=2 skips the first (header) line of the file.
   - DSD + comma delimiter: quoted values may contain commas and two adjacent
     commas denote a missing value.
   - MISSOVER: a record with fewer fields than variables leaves the remaining
     variables missing instead of reading on into the next line.
   Variables are declared in file-column order by the INFORMAT statement; the
   FORMAT statement only sets display widths. */
data Contracts;
	infile 'GovContracts_2000_2020.csv'
		dsd dlm = ',' missover
		lrecl = 32767
		firstobs = 2;

	/* Read informats, listed in the same order as the columns in the file. */
	informat
		contract_transaction_unique_key
		contract_award_unique_key
		award_id_piid
		modification_number                $200.
		transaction_number                 BEST32.
		parent_award_id_piid
		parent_award_modification_number   $200.
		federal_action_obligation          BEST32.
		action_date                        YYMMDD10.
		awarding_agency_code
		awarding_agency_name
		awarding_sub_agency_code
		awarding_sub_agency_name
		recipient_duns
		recipient_name
		recipient_parent_duns
		recipient_parent_name
		recipient_address_line_1
		recipient_address_line_2
		recipient_city_name
		recipient_county_name
		recipient_state_code
		recipient_zip_4_code
		award_type_code
		award_type                         $200.
		naics_code
		number_of_offers_received          BEST32.
	;

	/* Display formats, grouped by width. */
	format
		contract_transaction_unique_key
		contract_award_unique_key
		awarding_agency_name
		awarding_sub_agency_name
		recipient_name
		recipient_parent_name
		recipient_address_line_1
		recipient_address_line_2           $200.
		award_id_piid
		modification_number
		parent_award_id_piid
		parent_award_modification_number
		awarding_agency_code
		awarding_sub_agency_code
		recipient_duns
		recipient_parent_duns
		recipient_state_code
		recipient_zip_4_code
		award_type_code                    $20.
		recipient_city_name
		recipient_county_name
		award_type                         $50.
		transaction_number
		federal_action_obligation
		naics_code
		number_of_offers_received          BEST12.
		action_date                        YYMMDD10.
	;

	/* List input: one variable per CSV column, left to right. */
	input
		contract_transaction_unique_key $  contract_award_unique_key $
		award_id_piid $                    modification_number $
		transaction_number
		parent_award_id_piid $             parent_award_modification_number $
		federal_action_obligation
		action_date
		awarding_agency_code $             awarding_agency_name $
		awarding_sub_agency_code $         awarding_sub_agency_name $
		recipient_duns $                   recipient_name $
		recipient_parent_duns $            recipient_parent_name $
		recipient_address_line_1 $         recipient_address_line_2 $
		recipient_city_name $              recipient_county_name $
		recipient_state_code $             recipient_zip_4_code $
		award_type_code $                  award_type $
		naics_code
		number_of_offers_received
	;
run;



/* Narrow Contracts down to the identifier, agency, recipient-name and NAICS
   columns; every other column is dropped while the data set is read. */
data Contracts_Comp;
	set Contracts (
		keep = award_id_piid
		       parent_award_id_piid
		       awarding_sub_agency_code
		       recipient_name
		       recipient_parent_name
		       naics_code
	);
run;

/* Build a normalized company-name key (Comp_Name) for every contract row.

   COMPANYNAME_GC : recipient_parent_name, or recipient_name when the parent
                    name is blank; rows with neither are deleted.
   Comp_Name      : COMPANYNAME_GC with parenthesised text removed, upper-cased,
                    common legal-form / filler tokens removed, and finally
                    reduced to letters and digits only (no blanks, no
                    punctuation).

   Fixes relative to the earlier version of this step:
     * the keep-list passed to COMPRESS now contains the digit 6 (it was
       missing, so every "6" was stripped from company names);
     * "HOLDINGS" is removed before "HOLDING" - in the old order the HOLDINGS
       rule could never match and "X HOLDINGS" left a stray "S".
   NOTE(review): if another program builds the same key for matching, it must
   apply the same corrected rules. */
data Contracts_Comp2;
	set Contracts_Comp;

	/* Prefer the parent company name; fall back to the recipient name. */
	COMPANYNAME_GC = recipient_parent_name;
	if COMPANYNAME_GC = "" then COMPANYNAME_GC = recipient_name;
	if COMPANYNAME_GC = "" then delete;

	/* Drop every "(...)" group, then fold to upper case. */
	Comp_Name = prxchange('s/\(([^\)]+)\)//i', -1, COMPANYNAME_GC);
	Comp_Name = upcase(Comp_Name);

	/* Token removal. TRANWRD matches substrings, so where one token contains
	   another the longer one must be handled first. A zero-length replacement
	   is treated by TRANWRD as a single blank; those blanks are removed by the
	   COMPRESS calls at the end of the step. */
	Comp_Name = tranwrd(Comp_Name,"LLC","");
	Comp_Name = tranwrd(Comp_Name,"INCORPORATED","");
	Comp_Name = tranwrd(Comp_Name,"INC","");
	Comp_Name = tranwrd(Comp_Name,"LTD","");
	Comp_Name = tranwrd(Comp_Name,"CORPORATION","");
	Comp_Name = tranwrd(Comp_Name,"CORPORATI","");
	Comp_Name = tranwrd(Comp_Name,"CORP","");
	Comp_Name = tranwrd(Comp_Name,"COMPANY","");
	Comp_Name = tranwrd(Comp_Name," CO","");
	Comp_Name = tranwrd(Comp_Name,"LP","");
	Comp_Name = tranwrd(Comp_Name,"INTERNATIONAL","");
	Comp_Name = tranwrd(Comp_Name,"INTL","");
	Comp_Name = tranwrd(Comp_Name,"HOLDINGS","");
	Comp_Name = tranwrd(Comp_Name,"HOLDING","");
	Comp_Name = tranwrd(Comp_Name," AND ","");

	/* Keep (k) only letters and digits, case-insensitively (i), plus white
	   space (s); the second COMPRESS then strips the remaining blanks. */
	Comp_Name = compress(Comp_Name,"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789","kis");
	Comp_Name = compress(Comp_Name);
run;

/* Write one row per distinct Comp_Name to Output.Contracts_Comp, keeping only
   the original name, the normalized key and the NAICS code. NODUPKEY discards
   all but one observation for each BY value. */
proc sort
	data = Contracts_Comp2
	out = Output.Contracts_Comp (keep = COMPANYNAME_GC Comp_Name NAICS_Code)
	nodupkey;
	by Comp_Name;
run;

